# ==============================================================================
# Replication Script for:
# "My Own Private Idaho: A Survey of Separatist Attitudes in the Pacific Northwest"
# Authors: Semir Dzebo, Erin K. Jenne, and Levente Littvay
# ==============================================================================

# Load required packages
library(tidyverse)
library(ggplot2)
library(car)
library(psych)
library(survey)
library(fdrtool)
library(FDRestimation)

# Set working directory (modify as needed)
# setwd("your/path/here")

# ==============================================================================
# 1. Data Import and Initial Processing
# ==============================================================================

# Import data
df <- read_csv("replication_data.csv")

# Recode missing values
df <- df %>%
  mutate_all(~ ifelse(. %in% c(98, 99), NA, .))

# Recode gender
df$gender <- ifelse(df$gender > 2, NA, df$gender)
df <- df %>%
  mutate(gender = recode_factor(gender, "1" = "Male", "2" = "Female")) %>%
  mutate(gender = ifelse(gender == "Male", 0, 1)) %>%
  rename(female = gender)

# Recode party identification
df <- df %>%
  mutate(pid_self = recode_factor(pid_self, 
                                  `1` = "Republican", 
                                  `2` = "Democrat", 
                                  `3` = "Independent", 
                                  `4` = "SomethingElse"))

# Create binary partisanship variable
df <- df %>%
  mutate(pid_forced_binary = case_when(
    pid_self == "Republican" | pid_closer == 1 | pid_issues == 2 ~ "Republican",
    pid_self == "Democrat" | pid_closer == 2 | pid_issues == 1 ~ "Democrat",
    TRUE ~ "Other"
  )) %>%
  mutate(pid_forced_binary = as.factor(pid_forced_binary))

# Ensure numeric variables
df <- df %>%
  mutate_at(vars(2:4, 15:65, 67:72), as.numeric)

# Reverse code specified variables
recode_vars <- c(
  "ID2r", "ID5r", "ID6r", "F4r", "F5r", "F6r", 
  "E2r", "E3r", "E4r", "IR1r", "IR3r", "IR5r", 
  "IB2r", "IB5r", "IB6r", "NIS2r", "NIS4r", 
  "NIS6r", "EN2r", "EN4r", "EN5r", "RTD1r", 
  "RTD3r", "RTD5r"
)

df <- df %>%
  mutate(across(all_of(recode_vars), ~ case_when(
    . == 1 ~ 5,
    . == 2 ~ 4,
    . == 3 ~ 3,
    . == 4 ~ 2,
    . == 5 ~ 1,
    TRUE ~ .
  )))

# ==============================================================================
# 2. Construct Composite Measures
# ==============================================================================

# Calculate composite scores
df <- df %>%
  mutate(
    identity = rowMeans(select(., c("ID1", "ID2r", "ID3", "ID4", "ID5r", "ID6r")), na.rm = TRUE),
    fear = rowMeans(select(., c("F1", "F2", "F3", "F4r", "F5r", "F6r")), na.rm = TRUE),
    economy = rowMeans(select(., c("E1", "E2r", "E3r", "E4r", "E5", "E6")), na.rm = TRUE),
    intergroup_relations = rowMeans(select(., c("IR1r", "IR2", "IR3r", "IR4", "IR5r", "IR6")), na.rm = TRUE),
    ingroup_bias = rowMeans(select(., c("IB1", "IB2r", "IB3", "IB4", "IB5r", "IB6r")), na.rm = TRUE),
    identity_subversion = rowMeans(select(., c("NIS1", "NIS2r", "NIS3", "NIS4r", "NIS5", "NIS6r")), na.rm = TRUE),
    entitativity = rowMeans(select(., c("EN1", "EN2r", "EN3", "EN4r", "EN5r", "EN6")), na.rm = TRUE),
    dissent = rowMeans(select(., c("RTD1r", "RTD2", "RTD3r", "RTD4", "RTD5r", "RTD6")), na.rm = TRUE)
  )

# Create dummy variables
df$nonwhite <- ifelse(df$race == 1, 0, 1)
pid_self_dummies <- model.matrix(~ pid_self - 1, data = df)
pid_forced_dummies <- model.matrix(~ pid_forced_binary - 1, data = df)
df <- cbind(df, pid_self_dummies, pid_forced_dummies)

# ==============================================================================
# 3. Variable Standardization
# ==============================================================================

# Define independent variables
independent_vars <- c("identity", "fear", "economy", "intergroup_relations", 
                      "ingroup_bias", "identity_subversion", "entitativity", "dissent",
                      "age_group", "education", "income", "nonwhite",
                      "pid_selfDemocrat", "pid_selfRepublican", "pid_selfIndependent",
                      "pid_selfSomethingElse", "pid_forced_binaryDemocrat",
                      "pid_forced_binaryRepublican")

# Standardize variables
df_standardized <- df %>%
  mutate(across(all_of(independent_vars), 
                ~ (.-min(., na.rm = TRUE)) / 
                  (max(., na.rm = TRUE) - min(., na.rm = TRUE))))

# ==============================================================================
# 4. Regression Analysis
# ==============================================================================

# Baseline model
model1 <- lm(DV ~ 1 + identity + entitativity + ingroup_bias + identity_subversion + 
               dissent + fear + intergroup_relations + economy + 
               age_group + female + education + income + nonwhite + partial_counties +
               pid_forced_binaryRepublican, data = df_standardized)

# Gender-weighted model
# Calculate weights based on population vs. sample proportions
womenwt <- 0.49/0.68  # Population proportion / Sample proportion for women
menwt <- 0.51/0.32    # Population proportion / Sample proportion for men

# Add weight variable to dataset
df_standardized$genwt <- ifelse(df_standardized$female == 0, menwt, womenwt)

# Create survey design object for weighted regression
wtdf <- svydesign(ids = ~1, 
                  weights = ~genwt, 
                  data = df_standardized)

# Weighted regression model
model2 <- svyglm(DV ~ 1 + identity + entitativity + ingroup_bias + identity_subversion + 
                   dissent + fear + intergroup_relations + economy + 
                   age_group + female + education + income + nonwhite + partial_counties +
                   pid_forced_binaryRepublican,
                 design = wtdf,
                 family = gaussian(link = "identity"))

# Alternative specifications
# Model without education
model3 <- update(model1, . ~ . - education)

# Model without income
model4 <- update(model1, . ~ . - income)

# Model with detailed partisanship (Republican as reference category)
model5 <- update(model1, 
                 . ~ . - pid_forced_binaryRepublican + 
                   pid_selfDemocrat + pid_selfIndependent + 
                   pid_selfSomethingElse)

# Check for multicollinearity in baseline model
vif_values <- vif(model1)

# Store all models in a list for comparison
models_list <- list(
  "Baseline" = model1,
  "Gender Weighted" = model2,
  "No Education" = model3,
  "No Income" = model4,
  "Detailed Party ID" = model5
)

# Optional: Extract and compare results
# summary_stats <- lapply(models_list, summary)

# ==============================================================================
# 5. Multiple Comparison Corrections
# ==============================================================================

# Extract coefficient statistics from baseline model
coef_summary <- summary(model1)$coefficients[-1,]
coefficients <- coef_summary[, "Estimate"]
std_errors <- coef_summary[, "Std. Error"]
p_values <- coef_summary[, "Pr(>|t|)"]

# Apply Benjamini-Hochberg and Storey FDR corrections
bh_results <- p.fdr(p_values, threshold = 0.10)
bh_adjusted_p <- bh_results$`Results Matrix`[,"Adjusted p-values"]
storey_results <- fdrtool(p_values, statistic = "pvalue", plot = FALSE)
storey_q_values <- storey_results$qval

# Calculate rejected hypotheses counts for FCR adjustments
R_10 <- sum(storey_q_values <= 0.10)  # Rejections at q ≤ 0.10
R_05 <- sum(storey_q_values <= 0.05)  # Rejections at q ≤ 0.05
m <- length(p_values)  # Total hypotheses

# Calculate confidence levels and critical values
degrees_freedom <- model1$df.residual
critical_value_95 <- qt(0.975, degrees_freedom)  # Standard 95% CI

# Calculate FCR adjustments
fcr_conf_level_90 <- 1 - R_10*(0.10/m)  # 90% FCR adjustment
fcr_conf_level_95 <- 1 - R_05*(0.05/m)  # 95% FCR adjustment

# Calculate critical values for intervals
fcr_critical_value_90 <- qt((1 + fcr_conf_level_90)/2, degrees_freedom)
fcr_critical_value_95 <- qt((1 + fcr_conf_level_95)/2, degrees_freedom)

# Calculate confidence intervals
std_ci_95_lower <- coefficients - (critical_value_95 * std_errors)
std_ci_95_upper <- coefficients + (critical_value_95 * std_errors)
fcr_ci_90_lower <- coefficients - (fcr_critical_value_90 * std_errors)
fcr_ci_90_upper <- coefficients + (fcr_critical_value_90 * std_errors)
fcr_ci_95_lower <- coefficients - (fcr_critical_value_95 * std_errors)
fcr_ci_95_upper <- coefficients + (fcr_critical_value_95 * std_errors)

# ==============================================================================
# 6. Visualization of Main Results (Figure 1)
# ==============================================================================

# Create results dataframe with all necessary components
results_df <- data.frame(
  Variable = names(coefficients),
  Coefficient = coefficients,
  CI_lower_95_uncorrected = std_ci_95_lower,
  CI_upper_95_uncorrected = std_ci_95_upper,
  CI_lower_90_fcr = fcr_ci_90_lower,
  CI_upper_90_fcr = fcr_ci_90_upper,
  CI_lower_95_fcr = fcr_ci_95_lower,
  CI_upper_95_fcr = fcr_ci_95_upper
)

# Define variable labels with theoretically appropriate construct names
variable_labels <- c(
  identity = "Regional identification",
  entitativity = "Entitativity",
  ingroup_bias = "Ingroup bias",
  identity_subversion = "Identity subversion",
  dissent = "Right to dissent",
  fear = "Threat perception",
  intergroup_relations = "Intergroup relations",
  economy = "Perceived economic grievances",
  age_group = "Age group",
  female = "Female",
  education = "Education",
  income = "Income",
  nonwhite = "Non-white",
  partial_counties = "Partial counties",
  pid_forced_binaryRepublican = "Leaning Republican"
)

# Add variable labels and categories
results_df$Variable <- variable_labels[results_df$Variable]
results_df$category <- case_when(
  results_df$Variable %in% c("Regional identification", "Entitativity", "Ingroup bias", 
                             "Identity subversion", "Right to dissent") ~ "Identity theories",
  results_df$Variable %in% c("Threat perception", "Intergroup relations") ~ "Fears theories",
  results_df$Variable == "Perceived economic grievances" ~ "Economic theories",
  TRUE ~ "Control variables"
)

# Define desired order of variables for visualization
desired_order <- c(
  "Partial counties", "Leaning Republican", "Non-white", "Income", 
  "Education", "Female", "Age group", "Perceived economic grievances",
  "Intergroup relations", "Threat perception", "Right to dissent", 
  "Identity subversion", "Ingroup bias", "Entitativity", 
  "Regional identification"
)

# Create visualization with unadjusted and corrected confidence intervals
p <- ggplot(results_df, aes(x = Coefficient, y = forcats::fct_relevel(Variable, desired_order))) + 
  geom_vline(xintercept = 0, color = "black", linetype = "dashed") +
  
  # Unadjusted 95% CI
  geom_errorbarh(aes(xmin = CI_lower_95_uncorrected, xmax = CI_upper_95_uncorrected,
                     linetype = "Unadjusted 95%"),
                 height = 0, size = 1.2, color = "black") +
  
  # Multiple testing-adjusted CIs
  geom_errorbarh(aes(xmin = CI_lower_90_fcr, xmax = CI_upper_90_fcr,
                     linetype = "Multiple testing-adjusted 90%"),
                 height = 0, size = 0.8, color = "black") +
  geom_errorbarh(aes(xmin = CI_lower_95_fcr, xmax = CI_upper_95_fcr,
                     linetype = "Multiple testing-adjusted 95%"),
                 height = 0, size = 0.3, color = "black") +
  
  # Point estimates
  geom_point(aes(shape = category),
             fill = "black", color = "black", size = 3) +
  
  # Shape aesthetics
  scale_shape_manual(
    name = "Variable Categories",
    values = c("Control variables" = 21,
               "Economic theories" = 22,
               "Fears theories" = 23,
               "Identity theories" = 24),
    limits = c("Identity theories",
               "Fears theories",
               "Economic theories",
               "Control variables"),
    guide = guide_legend(title.position = "top",
                         direction = "vertical")
  ) +
  
  # Linetype aesthetics
  scale_linetype_manual(
    name = "Confidence Intervals",
    limits = c("Unadjusted 95%",
               "Multiple testing-adjusted 90%",
               "Multiple testing-adjusted 95%"),
    values = c("Unadjusted 95%" = "solid",
               "Multiple testing-adjusted 90%" = "solid",
               "Multiple testing-adjusted 95%" = "solid"),
    guide = guide_legend(
      override.aes = list(
        linetype = rep("solid", 3),
        size = c(1.2, 0.8, 0.3)
      ),
      title.position = "top",
      nrow = 1
    )
  ) +
  
  # Labels and theme
  labs(
    x = "Standardized Coefficient Estimate",
    y = ""
  ) +
  theme_minimal() +
  theme(
    axis.title.y = element_text(size = 12, margin = margin(r = 10)),
    axis.title.x = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.text = element_text(size = 10),
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.spacing.x = unit(1, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.box.spacing = unit(0.1, "cm"),
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(0, 0, 0, 0),
    panel.grid.minor = element_blank()
  ) +
  guides(
    shape = guide_legend(order = 1, position = "right"),
    linetype = guide_legend(order = 2, position = "bottom")
  )

# Save the figure as a tiff file
ggsave(
  filename = "figure1_separatism_highres.tiff",
  plot = p,
  device = "tiff",
  width = 11.02,
  height = 5.08,
  dpi = 1200,
  compression = "lzw",
  units = "in",
  bg = "white"
)

# ==============================================================================
# 7.Census Comparison (Table B.1) and Visualization of DV Distribution (Figure B.1)
# ==============================================================================

# Define census benchmarks
census_benchmarks <- data.frame(
  Characteristic = c("65+ years old", "Female", "White alone", 
                     "BA degree or higher", "Median household income"),
  Census = c(21, 49, 91, 27, 68469.34),
  stringsAsFactors = FALSE
)

# Calculate demographic percentages using original df
pct_65plus <- round(mean(df$age_group == 7, na.rm = TRUE) * 100, 0)
pct_female <- round(mean(df$female == 1, na.rm = TRUE) * 100, 0)
pct_white <- round(mean(df$nonwhite == 0, na.rm = TRUE) * 100, 0)
pct_ba_plus <- round(mean(df$education >= 5, na.rm = TRUE) * 100, 0)

# Set income category based on median
income_category <- median(df$income) 
income_category <- "$50,000--$74,999" # Median of 3 corresponds to this range

# Combine sample statistics
sample_stats <- data.frame(
  Characteristic = census_benchmarks$Characteristic,
  Sample = c(
    pct_65plus,
    pct_female,
    pct_white,
    pct_ba_plus,
    NA
  )
)

# Create comparison table 
comparison_table <- census_benchmarks %>%
  left_join(sample_stats, by = "Characteristic") %>%
  mutate(
    Census = case_when(
      Characteristic == "Median household income" ~ sprintf("$%.2f", Census),
      TRUE ~ paste0(round(Census, 0), "%")
    ),
    Sample = case_when(
      Characteristic == "Median household income" ~ income_category,
      TRUE ~ paste0(round(Sample, 0), "%")
    )
  ) %>%
  select(Characteristic, Census, Sample)

# Print comparison table
print(knitr::kable(
  comparison_table %>%
    select(Characteristic, Census, Sample),
  col.names = c("Characteristic", "Census (2022)", "Sample"),
  caption = "Comparison of Census and Sample Demographics"
))

# Calculate DV distribution using original df
dv_distribution <- as.data.frame(table(df$DV)) %>%
  mutate(
    category = as.numeric(Var1),
    percentage = round(Freq / sum(Freq) * 100, 1)
  )

# Calculate DV distribution
dv_distribution <- as.data.frame(table(df_standardized$DV)) %>%
  mutate(
    category = as.numeric(Var1),
    n = Freq,
    percentage = round(Freq / sum(Freq) * 100, 2)
  )

# Create the distribution plot
p_descriptive <- ggplot(dv_distribution, 
                        aes(x = category, y = percentage)) +
  geom_bar(stat = "identity", 
           fill = "grey80",
           color = "black", 
           width = 0.3) +
  scale_x_continuous(breaks = 1:5) +
  scale_y_continuous(
    labels = function(x) paste0(x, "%"),
    limits = c(0, 60),
    breaks = seq(0, 60, by = 20)
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(size = 10)
  )

# Save visualization
ggsave(
  filename = "figure_b1_dv_distribution.tiff",
  plot = p_descriptive,
  device = "tiff",
  width = 4,
  height = 3,
  dpi = 1200,
  compression = "lzw",
  units = "in",
  bg = "white"
)

# ==============================================================================
# 8. Generate Table Data
# ==============================================================================

# Calculate descriptive statistics (Table 2)
descriptive_stats <- df_standardized %>%
  select(DV, identity, entitativity, ingroup_bias, identity_subversion,
         dissent, fear, intergroup_relations, economy,
         age_group, female, education, income, nonwhite, partial_counties,
         pid_forced_binaryRepublican, pid_selfDemocrat, pid_selfIndependent,
         pid_selfSomethingElse) %>%
  describe() %>%
  select(mean, sd, se) %>%
  round(2)

# Calculate multiple testing corrections results (Table C.1)
mc_table <- data.frame(
  Variable = names(coefficients),
  Coefficient = coefficients,
  SE = std_errors,
  Raw_p = p_values,
  BH_p = bh_adjusted_p,
  Storey_q = storey_q_values,
  CI_95 = paste0("[", round(std_ci_95_lower, 3), ", ", round(std_ci_95_upper, 3), "]"),
  FCR_90 = paste0("[", round(fcr_ci_90_lower, 3), ", ", round(fcr_ci_90_upper, 3), "]"),
  FCR_95 = paste0("[", round(fcr_ci_95_lower, 3), ", ", round(fcr_ci_95_upper, 3), "]")
)






